{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GPU LightGBM (light gradient boosting) trained on TF-IDF features reduced to 50 dimensions\n",
    "\n",
    "1. Same emotion dataset from [NLP-dataset](https://github.com/huseinzol05/NLP-Dataset)\n",
    "2. Same split: 80% training, 20% testing; results may vary depending on randomness\n",
    "3. Same regex substitution '[^\\\"\\'A-Za-z0-9 ]+'\n",
    "\n",
    "## Example\n",
    "\n",
    "Features are based on term frequency-inverse document frequency (TF-IDF).\n",
    "\n",
    "After that we apply SVD to reduce the features to 50 dimensions (`n_components = 50`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import lightgbm as lgb\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "import numpy as np\n",
    "import re\n",
    "import time\n",
    "# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection\n",
    "# (already used below for StratifiedKFold) provides train_test_split.\n",
    "from sklearn.model_selection import train_test_split\n",
    "import sklearn.datasets\n",
    "from sklearn import pipeline\n",
    "from sklearn.model_selection import StratifiedKFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def clearstring(string):\n",
    "    \"\"\"Return `string` reduced to quotes, ASCII letters, digits and single spaces.\"\"\"\n",
    "    # Every character outside the allowed set is deleted, not replaced by a space.\n",
    "    kept = re.sub('[^\\\"\\'A-Za-z0-9 ]+', '', string)\n",
    "    # Empty pieces left by consecutive spaces are skipped before re-joining.\n",
    "    words = [piece.strip() for piece in kept.split(' ') if piece]\n",
    "    return ' '.join(words)\n",
    "\n",
    "# sklearn.datasets reads each document as a single element,\n",
    "# so every document is split on newlines into separate samples.\n",
    "def separate_dataset(trainset):\n",
    "    \"\"\"Split every document of `trainset` into cleaned lines, one label per line.\n",
    "\n",
    "    Returns (texts, labels): `texts` holds each non-empty line of each entry in\n",
    "    `trainset.data` after `clearstring`; `labels` repeats the matching\n",
    "    `trainset.target` entry once per line. A line that is non-empty before\n",
    "    cleaning but empty afterwards is still kept.\n",
    "    \"\"\"\n",
    "    texts, labels = [], []\n",
    "    for idx, document in enumerate(trainset.data):\n",
    "        # Blank lines are dropped before cleaning.\n",
    "        cleaned = [clearstring(line) for line in document.split('\\n') if line]\n",
    "        texts.extend(cleaned)\n",
    "        labels.extend(trainset.target[idx] for _ in cleaned)\n",
    "    return texts, labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load every file under 'data' (decoded as UTF-8) into a sklearn dataset object.\n",
    "trainset_data = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "# Replace the per-file documents/targets with the per-line samples and labels\n",
    "# returned by separate_dataset.\n",
    "trainset_data.data, trainset_data.target = separate_dataset(trainset_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Hold out 20% of the samples for testing.\n",
    "# NOTE(review): no random_state is passed, so the split differs from run to run.\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(trainset_data.data, trainset_data.target, test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Fit TF-IDF + SVD on the training split only: fitting on the full corpus would let\n",
    "# test-set vocabulary and document frequencies leak into the features being evaluated.\n",
    "# Must run while train_X still holds raw text, i.e. before the transform cell overwrites it.\n",
    "decompose = pipeline.Pipeline([('count', TfidfVectorizer()),\n",
    "                               ('svd', TruncatedSVD(n_components=50))]).fit(train_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# LightGBM parameters handed to lgb.train in the training cell.\n",
    "params_lgb = {\n",
    "    'max_depth': 27, \n",
    "    'learning_rate': 0.03,\n",
    "    'verbose': 50, \n",
    "    # Same patience as the early_stopping_rounds=200 argument passed to lgb.train.\n",
    "    'early_stopping_round': 200,\n",
    "    'metric': 'multi_logloss',\n",
    "    'objective': 'multiclass',\n",
    "    # One class per entry in the dataset's target_names.\n",
    "    'num_classes': len(trainset_data.target_names),\n",
    "    # Train on the GPU; presumably the ids below select the first platform/device -\n",
    "    # TODO confirm against the LightGBM GPU documentation.\n",
    "    'device': 'gpu',\n",
    "    'gpu_platform_id': 0,\n",
    "    'gpu_device_id': 0\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Project both splits through the fitted TF-IDF + SVD pipeline (n_components=50).\n",
    "# NOTE: this overwrites the raw-text train_X / test_X, so re-running this cell alone\n",
    "# feeds already-transformed arrays back into the pipeline; re-run the split cell first.\n",
    "train_X = decompose.transform(train_X)\n",
    "test_X = decompose.transform(test_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training until validation scores don't improve for 200 rounds.\n",
      "[100]\ttraining's multi_logloss: 1.51641\tvalid_1's multi_logloss: 1.52991\n",
      "[200]\ttraining's multi_logloss: 1.48039\tvalid_1's multi_logloss: 1.50404\n",
      "[300]\ttraining's multi_logloss: 1.46016\tvalid_1's multi_logloss: 1.49379\n",
      "[400]\ttraining's multi_logloss: 1.44402\tvalid_1's multi_logloss: 1.48755\n",
      "[500]\ttraining's multi_logloss: 1.43032\tvalid_1's multi_logloss: 1.4837\n",
      "[600]\ttraining's multi_logloss: 1.41806\tvalid_1's multi_logloss: 1.4811\n",
      "[700]\ttraining's multi_logloss: 1.40679\tvalid_1's multi_logloss: 1.4791\n",
      "[800]\ttraining's multi_logloss: 1.39626\tvalid_1's multi_logloss: 1.47765\n",
      "[900]\ttraining's multi_logloss: 1.38603\tvalid_1's multi_logloss: 1.4765\n",
      "[1000]\ttraining's multi_logloss: 1.37627\tvalid_1's multi_logloss: 1.47559\n",
      "[1100]\ttraining's multi_logloss: 1.36678\tvalid_1's multi_logloss: 1.47482\n",
      "[1200]\ttraining's multi_logloss: 1.35761\tvalid_1's multi_logloss: 1.4741\n",
      "[1300]\ttraining's multi_logloss: 1.34862\tvalid_1's multi_logloss: 1.47349\n",
      "[1400]\ttraining's multi_logloss: 1.33981\tvalid_1's multi_logloss: 1.47288\n",
      "[1500]\ttraining's multi_logloss: 1.33125\tvalid_1's multi_logloss: 1.47229\n",
      "[1600]\ttraining's multi_logloss: 1.32281\tvalid_1's multi_logloss: 1.47181\n",
      "[1700]\ttraining's multi_logloss: 1.31465\tvalid_1's multi_logloss: 1.47146\n",
      "[1800]\ttraining's multi_logloss: 1.30664\tvalid_1's multi_logloss: 1.47115\n",
      "[1900]\ttraining's multi_logloss: 1.29872\tvalid_1's multi_logloss: 1.47091\n",
      "[2000]\ttraining's multi_logloss: 1.29104\tvalid_1's multi_logloss: 1.47071\n",
      "[2100]\ttraining's multi_logloss: 1.28331\tvalid_1's multi_logloss: 1.47047\n",
      "[2200]\ttraining's multi_logloss: 1.2759\tvalid_1's multi_logloss: 1.47042\n",
      "[2300]\ttraining's multi_logloss: 1.26851\tvalid_1's multi_logloss: 1.47032\n",
      "[2400]\ttraining's multi_logloss: 1.26119\tvalid_1's multi_logloss: 1.47017\n",
      "[2500]\ttraining's multi_logloss: 1.25404\tvalid_1's multi_logloss: 1.47011\n",
      "[2600]\ttraining's multi_logloss: 1.247\tvalid_1's multi_logloss: 1.47003\n",
      "[2700]\ttraining's multi_logloss: 1.24004\tvalid_1's multi_logloss: 1.46998\n",
      "[2800]\ttraining's multi_logloss: 1.23314\tvalid_1's multi_logloss: 1.46996\n",
      "[2900]\ttraining's multi_logloss: 1.22632\tvalid_1's multi_logloss: 1.46997\n",
      "[3000]\ttraining's multi_logloss: 1.21957\tvalid_1's multi_logloss: 1.46994\n",
      "Early stopping, best iteration is:\n",
      "[2849]\ttraining's multi_logloss: 1.22982\tvalid_1's multi_logloss: 1.46992\n",
      "415.922 Seconds to train lgb\n"
     ]
    }
    ],
   "source": [
    "# Wrap the reduced features and labels as LightGBM datasets.\n",
    "d_train = lgb.Dataset(train_X, train_Y)\n",
    "d_valid = lgb.Dataset(test_X, test_Y)\n",
    "# NOTE(review): the held-out test split doubles as the early-stopping validation set,\n",
    "# so the later evaluation on test_X is not fully independent of training.\n",
    "watchlist = [d_train, d_valid]\n",
    "t=time.time()\n",
    "# Up to 100000 boosting rounds; stop once the validation loss has not improved for\n",
    "# 200 rounds, logging the metric every 100 rounds.\n",
    "clf = lgb.train(params_lgb, d_train, 100000, watchlist, early_stopping_rounds=200, verbose_eval=100)\n",
    "print(round(time.time()-t, 3), 'Seconds to train lgb')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.38      0.05      0.09     11460\n",
      "       fear       0.32      0.06      0.10      9545\n",
      "        joy       0.44      0.73      0.55     28052\n",
      "       love       0.17      0.01      0.02      7015\n",
      "    sadness       0.39      0.54      0.45     24291\n",
      "   surprise       0.09      0.01      0.01      2999\n",
      "\n",
      "avg / total       0.37      0.42      0.34     83362\n",
      "\n"
     ]
    }
    ],
   "source": [
    "from sklearn import metrics\n",
    "# The output of clf.predict is reduced with argmax over axis 1 to one class index per\n",
    "# sample, then compared against test_Y in a per-class precision/recall/F1 report.\n",
    "print(metrics.classification_report(test_Y, np.argmax(clf.predict(test_X), axis = 1), target_names = trainset_data.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Persist the trained booster to 'lgb-tfidf-svd50.model' (relative path).\n",
    "clf.save_model('lgb-tfidf-svd50.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
